clear
set more off
macro drop all
capture log close

/********************************************************************************
Discrimination in Multi-Phase Systems: Evidence from Child Protection
Clean Workers Data

Created on: 8/29/18

Last Modified on: 2/20/2024

Description: This file cleans the raw workers data.

Note that we have removed the file directory names from this program for 
confidentiality reasons.
********************************************************************************/

** Setting the Directory
global rawdata
global cleandata 
global tmp 

/********************************************************************************

The raw workers data should be unique at the case*child*worker*start date
level, although there are some duplicates.  Eventually, I'll want to use a cleaned
version of this data to construct a child*case level dataset so I need to make the
raw data more manageable to work with first by create a file which is unique at
the case*child*worker*start date level.

There are 2 sections of this do file:
1) Clean Variables
2) Collapse to Child*Case*Worker*Start Date level

*******************************************************************************/

*************************
**1) CLEAN VARIABLES
*************************

**Load raw workers data
use "${rawdata}workers.dta", clear

**Rename and label variables

rename intakechildvicpartyid vicid
la var vicid "Child Victim ID"

la var caseid "Case ID: Either Inv Caseid or Ongoing Caseid"

rename srccaseid src_caseid
la var src "SRC Case ID"

****note, Andrew told me that the employee_id field is *more* unique than the 
****emp_party_id field in that the latter represents unique workers while the former
****might change if a worker changes offices or takes a leave and then returns.
****For this reason, I only use the emp_party_id variable.
drop employee_id

rename emp_party_id worker_id
la var worker_id "DHHS Worker ID"

gen caseworker=(jobfuncdesc2=="CPS Worker")
la var caseworker "CPS Caseworker"

gen supervisor=(jobfuncdesc2=="CPS Supervisor")
la var supervisor "CPS Supervisor"

gen manager=(jobfuncdesc2=="CPS Manager")
la var manager "CPS Manager"

drop jobfuncdesc2

gen worker_role=.
replace worker_role=1 if assign=="Primary"
replace worker_role=2 if assign=="Secondary"
replace worker_role=3 if assign=="Courtesy"
la var worker_role "Role of Worker"
la def role 1 "Primary" 2 "Secondary" 3 "Courtesy"
la val worker_role role
drop assign

rename agency_type worker_agency
la var worker_agency "Type of Agency of Worker"

gen worker_startdate=date(cse_asgmnt_start_dt, "YMD")
format worker_startdate %d
la var worker_startdate "Date Worker was Assigned to Case"

gen worker_enddate=date(cse_asgmnt_end_dt, "YMD")
format worker_enddate %d
la var worker_enddate "Date Worker was Removed from Case"
drop cse*

la var agency_name "Agency Name"

duplicates drop
sort vicid caseid worker_id worker_startdate

*************************
**2) COLLAPSE TO CHILD*CASE*WORKER*START DATE LEVEL
*************************

duplicates report vicid caseid worker_id worker_startdate

/*
--------------------------------------
   copies | observations       surplus
----------+---------------------------
        1 |     17873505             0
        2 |       226960        113480
        3 |         7512          5008
        4 |         2192          1644
        5 |          205           164
        6 |           36            30
--------------------------------------
*/


duplicates tag vicid caseid worker_id worker_startdate, gen(dups)

**The 2 main reasons for duplicates are conflicting information on worker roles and 
**conflicting end date information. For the roles, assume the hierarchy is primary,
**secondary and courtesy and give priority to the higher position on the hierarchy.
**For end date, keep the earliest end date.

********(a) Worker Role
sort vicid caseid worker_id worker_startdate worker_role
bysort vicid caseid worker_id worker_startdate: gen role_flag=1 if worker_role[1]!=worker_role[_N] & dups>0
bysort vicid caseid worker_id worker_startdate: egen role_min=min(worker_role) if role_flag==1
replace worker_role=role_min if role_flag==1
drop role_min role_flag

********(b) End date
sort vicid caseid worker_id worker_startdate worker_enddate
bysort vicid caseid worker_id worker_startdate: gen end_flag=1 if worker_enddate[1]!=worker_enddate[_N] & dups>0
bysort vicid caseid worker_id worker_startdate: egen end_min=min(worker_enddate) if end_flag==1
replace worker_enddate=end_min if end_flag==1 & end_min!=.
drop end_flag end_min dups

duplicates drop 

duplicates report vicid caseid worker_id worker_startdate

/*
--------------------------------------
   copies | observations       surplus
----------+---------------------------
        1 |     17979696             0
        2 |        20746         10373
        3 |           45            30
--------------------------------------

*/

duplicates tag vicid caseid worker_id worker_startdate, gen(dups)

**There are still some duplicate observations.  These are a result of conflicting
**job function information.  Assume the hierarchy is supervisor, manager and caseworker
**and give priority to the higher position on the hierarchy.
gen jobfunc=.
replace jobfunc=1 if caseworker==1
replace jobfunc=2 if manager==1
replace jobfunc=3 if supervisor==1
drop caseworker supervisor manager
sort vicid caseid worker_id worker_startdate jobfunc
bysort vicid caseid worker_id worker_startdate: gen job_flag=1 if jobfunc[1]!=jobfun[_N] & dups>0
bysort vicid caseid worker_id worker_startdate: egen job_max=max(jobfunc) if dups>0
replace jobfunc=job_max if job_flag==1
drop job_max job_flag dups
la def j 1 "Caseworker" 2 "Manager" 3 "Supervisor"
la val jobfunc j
la var jobfunc "Worker Job"
rename jobfunc worker_job

duplicates drop

duplicates report vicid caseid worker_id worker_startdate
/*
--------------------------------------
   copies | observations       surplus
----------+---------------------------
        1 |     17985824             0
        2 |         8516          4258
        3 |            6             4
--------------------------------------

*/

duplicates tag vicid caseid worker_id worker_startdate, gen(dups)

**The only remaining duplicates are conflicts with the type of agency that employed
**the worker and the name of the agency.  Going by sheer percentages, let's prioritize 
**the agencies that occur most often in the data: County Office, Central Intake, 
**Private Agency, Central Office and Residential.  For agency name, there is no 
**real way to know which is correct so just choose randomly.

********(a) Agency Type
gen agency=.
replace agency=1 if worker_agency=="DHS Residential"
replace agency=2 if worker_agency=="Central Office"
replace agency=3 if worker_agency=="Private Agency"
replace agency=4 if worker_agency=="Central Intake"
replace agency=5 if worker_agency=="DHS County Office"
drop worker_agency
sort vicid caseid worker_id worker_startdate agency
bysort vicid caseid worker_id worker_startdate: gen ag_flag=1 if agency[1]!=agency[_N] & dups>0
bysort vicid caseid worker_id worker_startdate: egen ag_max=max(agency) if dups>0
replace agency=ag_max if ag_flag==1
drop ag_max ag_flag
rename agency worker_agency_type
la def ag 1 "DHS Residential" 2 "Central Office" 3 "Private Agency" 4 "Central Intake" ///
	5 "DHS County Office"
la val worker_agency_type ag
la var worker_agency_type "Worker Agency Type"


********(b) Agency Name
bysort vicid caseid worker_id worker_startdate: gen ag_flag=1 if agency_name[1]!=agency_name[_N] & dups>0
gen random=runiform() if ag_flag==1
bysort vicid caseid worker_id worker_startdate: egen rand_min=min(random) if random!=.
gen agency_name_random=agency_name if random==rand_min
replace agency_name="" if ag_flag==1
bysort vicid caseid worker_id worker_startdate: egen agency_name_mode=mode(agency_name_random) if ag_flag==1
replace agency_name=agency_name_mode if ag_flag==1
drop dups ag_flag random rand_min agency_name_random agency_name_mode 

duplicates drop

duplicates report vicid caseid worker_id worker_startdate

**Save clean workers data
compress
sort vicid caseid worker_id worker_startdate
save "${cleandata}workers_clean.dta", replace

*************************
**3) COLLAPSE TO CHILD*CASE LEVEL
*************************

**Load clean workers data
use "${cleandata}workers_clean.dta", clear

**Keep only the cases handled by DHS county offices
keep if worker_agency==5

**Calculate the amount of time that each worker spent on each case
gen days_worked=worker_enddate-worker_startdate
replace days_worked=0 if days_worked<0

**Identify the first day a case was worked on
bysort vicid caseid: egen startdate=min(worker_startdate)
format startdate %td

**Drop workers who were removed from the case the same day they started it
drop if days_worked==0

**Drop workers who were assigned on the first day of the case but removed the next day
drop if worker_startdate==startdate & days_worked==1

**Now, limit the dataset to only the workers who were assigned within the first few days
**of the first day of the case 
********note: in the scenario that there are no assigned workers within 5 days of the
********start date, then keep the whole log of assigned workers
gen flag=1 if startdate+5>=worker_startdate
bysort vicid caseid: egen flag_max=max(flag)
keep if flag==1 | flag_max==0

**If there was only one worker assigned in this 5 day window, then by default, they 
**must be the first assigned worker. But if there is more than one, then I need to
**figure out which was the first assigned investigator based off of job title (caseworker
**vs supervisor) and days worked.
bysort caseid vicid: gen nworkers=_N

*****************************
**SECTION 1: Deals with cases where there is either only 1 worker or there is a 
**clear primary investigator
*****************************

gen first=0

********1a) Only 1 worker in window
replace first=1 if nworkers==1 
bysort vicid caseid: egen first_max=max(first)

*******1b) More than 1 worker, but only 1 is an investigator				
gen investigator=(worker_job==1)
bysort caseid vicid: egen ninvestigators=total(investigator)
replace first=1 if investigator==1 & ninvestigators==1 & first_max==0
drop first_max
bysort vicid caseid: egen first_max=max(first)

*******1c) More than 1 caseworker, but only 1 is the primary			
gen primary_inv=(worker_role==1 & investigator==1)
bysort caseid vicid: egen nprimary_inv=total(primary_inv)
replace first=1 if primary_inv==1 & nprimary_inv==1 & first_max==0
drop first_max
bysort vicid caseid: egen first_max=max(first)

*******1d) More than 1 primary caseworker-- choose the one who was first assigned the case
bysort vicid caseid primary_inv: egen date_min=min(worker_startdate)
gen first_startdate=1 if worker_startdate==date_min
bysort vicid caseid primary_inv: egen total_first_startdate=total(first_startdate)
replace first=1 if nprimary_inv>1 & primary_inv==1 & worker_startdate==date_min & total_first_startdate==1 & first_max==0
drop first_max
bysort vicid caseid: egen first_max=max(first)			

******1e) But if more than one primary caseworkers were assigned the case on the same day, then choose the one
******	 who worked the case for the longest
bysort vicid caseid primary_inv worker_startdate: egen days_max=max(days_worked)
gen most_days=1 if days_worked==days_max
bysort vicid caseid primary_inv worker_startdate: egen total_most_days=total(most_days)
replace first=1 if nprimary_inv>1 & primary_inv==1 & worker_startdate==date_min & most_days==1 & total_most_days==1 & first_max==0
drop first_max
bysort vicid caseid: egen first_max=max(first)

******1f) If there is more than one primary caseworker assigned on the same day who both
******   worked for the same amount of time on the case together, then choose randomly
******   among them.
set seed 12345
gen random=runiform()
bysort vicid caseid: egen random_max=max(random) if primary_inv==1 & first_max!=1
replace first=1 if random==random_max & primary_inv==1 & first_max==0
drop first_max
bysort vicid caseid: egen first_max=max(first)

save "$tmp/find_first_worker.dta", replace

**note: at this point, nearly all (3,326,601/3,363,217) child*cases are assigned a first worker.
**so make the dataset smaller for the last few.
preserve
keep if first==1
save "$tmp/has_first_worker.dta", replace
restore

preserve 
drop if first_max==1
save "$tmp/no_first_worker.dta", replace
restore

*****************************
**SECTION 2: Deals with cases where there is not a clear primary investigator
*****************************
use "$tmp/no_first_worker.dta", clear

**2a) If there are any caseworkers (secondary or courtesy), assign to the one who was 
**   first assigned the case or, if there are ties, the one who worked for the longest.
bysort vicid caseid investigator: egen startdate_min=min(worker_startdate)
gen start_is_min=(worker_startdate==startdate_min)
bysort vicid caseid investigator worker_startdate: egen start_is_min_total=total(start_is_min)
replace first=1 if investigator==1 & worker_startdate==startdate_min & start_is_min_total==1 & first_max==0
drop first_max
bysort vicid caseid: egen first_max=max(first)
drop most_days days_max total_most_days

bysort vicid caseid investigator worker_startdate: egen days_max=max(days_worked)
gen most_days=1 if days_worked==days_max
bysort vicid caseid investigator worker_startdate: egen total_most_days=total(most_days)
replace first=1 if investigator==1 & worker_startdate==startdate_min & start_is_min_total>1 & ///
	days_worked==days_max & total_most_days==1 & first_max==0
drop first_max
bysort vicid caseid: egen first_max=max(first)

**(2b) If there is more than one caseworker who are assigned on the first day of the case and work
**    on the case for the same amount of time, choose randomly among them
drop random_max
bysort vicid caseid investigator worker_startdate days_worked: egen random_max=max(random)
replace first=1 if investigator==1 & worker_startdate==startdate_min & days_worked==days_max ///
	& random==random_max & first_max==0
drop first_max
bysort vicid caseid: egen first_max=max(first)

**(2c) If there are no invenstigators, assign to the primary worker who was first assigned the
**    cases or, if there are ties, the one who worked for the longest
drop startdate_min start_is_min* days_max most_days total_most_days
gen primary=1 if worker_role==1
bysort vicid caseid primary: egen startdate_min=min(worker_startdate)
gen start_is_min=(worker_startdate==startdate_min)
bysort vicid caseid primary worker_startdate: egen start_is_min_total=total(start_is_min)
replace first=1 if primary==1 & worker_startdate==startdate_min & start_is_min_total==1 & first_max==0
drop first_max
bysort vicid caseid: egen first_max=max(first)

bysort vicid caseid primary worker_startdate: egen days_max=max(days_worked)
gen most_days=1 if days_worked==days_max
bysort vicid caseid primary worker_startdate: egen total_most_days=total(most_days)
replace first=1 if primary==1 & worker_startdate==startdate_min & start_is_min_total>1 & ///
	days_worked==days_max & total_most_days==1 & first_max==0
drop first_max
bysort vicid caseid: egen first_max=max(first)

**(2d) If there is more than one primary worker assigned on the first day of the case and work
**    on the case for the same amount of time, choose randomly among them
drop random_max
bysort vicid caseid primary worker_startdate days_worked: egen random_max=max(random)
replace first=1 if primary==1 & worker_startdate==startdate_min & days_worked==days_max ///
	& random==random_max & first_max==0
drop first_max
bysort vicid caseid: egen first_max=max(first)

**(2e) If there are no primary workers, assign to the worker who was first assigned the case or,
**    if there are ties, the one who worked for the longest.
replace first=1 if primary!=1 & worker_startdate==startdate_min & start_is_min_total==1 & first_max==0
drop first_max
bysort vicid caseid: egen first_max=max(first)

replace first=1 if primary!=1 & worker_startdate==startdate_min & start_is_min_total>1 & ///
	days_worked==days_max & total_most_days==1 & first_max==0
drop first_max
bysort vicid caseid: egen first_max=max(first)

**(2f) If there is more than one primary worker assigned on the first day of the case and work
**    on the case for the same amount of time, choose randomly among them
replace first=1 if primary!=1 & worker_startdate==startdate_min & days_worked==days_max ///
	& random==random_max & first_max==0
drop first_max
bysort vicid caseid: egen first_max=max(first)

**Now, every case*child should be assigned a first investigator, so append to the 
**temp file storing the other first investigations
keep if first==1
gzappend using $tmp/has_first_worker.dta.gz
keep vicid caseid src_caseid worker_id worker_job agency_name
rename agency_name worker_county
tab worker_job, m

**Save child*case level file
sort vicid caseid
compress
save "${cleandata}workers_child_case_level.dta", replace



